Communication Patterns within Amusement Park
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
from itertools import product
df_fri = pd.read_csv("./MC2 2015 Data/comm-data-Fri.csv")
df_sat = pd.read_csv("./MC2 2015 Data/comm-data-Sat.csv")
df_sun = pd.read_csv("./MC2 2015 Data/comm-data-Sun.csv")
df = pd.concat([df_fri, df_sat, df_sun])
df
| Timestamp | from | to | location | |
|---|---|---|---|---|
| 0 | 2014-6-06 08:03:19 | 439105 | 1053224 | Kiddie Land |
| 1 | 2014-6-06 08:03:19 | 439105 | 1696241 | Kiddie Land |
| 2 | 2014-6-06 08:03:19 | 439105 | 580064 | Kiddie Land |
| 3 | 2014-6-06 08:03:19 | 439105 | 1464748 | Kiddie Land |
| 4 | 2014-6-06 08:03:47 | 1836139 | 1593258 | Entry Corridor |
| ... | ... | ... | ... | ... |
| 1548719 | 2014-6-08 23:20:37 | 1983198 | external | Tundra Land |
| 1548720 | 2014-6-08 23:20:38 | 1555391 | 857616 | Tundra Land |
| 1548721 | 2014-6-08 23:21:04 | 839736 | 2022346 | Entry Corridor |
| 1548722 | 2014-6-08 23:22:05 | 839736 | 1109589 | Entry Corridor |
| 1548723 | 2014-6-08 23:23:57 | 1222078 | external | Entry Corridor |
4153329 rows × 4 columns
# Change type of Timestamp to datetime and from to object
df['from'] = df['from'].astype('object')
df['from'] = df['from'].astype('string')
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df["day"] = df["Timestamp"].dt.day
df["hour"] = df["Timestamp"].dt.hour
df["minute"] = df["Timestamp"].dt.minute
#Checks for the sum of duplicated rows
df.duplicated().sum()
1257
#Drop duplicates
df = df.drop_duplicates()
from_counts = df.groupby([df["from"]]).size().reset_index().rename(columns={0: "Count"})
to_counts = df.groupby([df["to"]]).size().reset_index().rename(columns={0: "Count"})
id_counts = pd.merge(from_counts,to_counts,left_on=["from"],right_on=["to"])
id_counts["TotCount"] = id_counts["Count_x"]+id_counts["Count_y"]
id_counts = id_counts.drop(columns=["Count_x","Count_y","to"])
id_counts = id_counts.rename(columns = {"from":"id"})
top20ids = id_counts.sort_values('TotCount',ascending=False).head(20)['id'].tolist()
The display of Scott Jones’s soccer memorabilia in the Creighton Pavilion at DinoFun World was vandalized. A number of the items of memorabilia had been damaged or had been defaced with black spray paint. Spray paint had been used to write derogatory statements about Scott Jones on the display and throughout the Pavilion. Our goal is to figure out the patterns within the communication data to figure out when this crime occurred.
Visualization 1: Relationship between the number of messages sent and received
from_counts = df.groupby(df["from"]).size().reset_index().rename(columns = {0: "FromCount"})
to_counts = df.groupby(df["to"]).size().reset_index().rename(columns = {0: "ToCount"})
totfromto_counts = pd.merge(left = from_counts, right = to_counts, left_on = 'from',right_on = 'to')
totfromto_counts["TotalCount"] = totfromto_counts["FromCount"]+totfromto_counts["ToCount"]
totfromto_counts = totfromto_counts.sort_values("TotalCount",ascending=False)
totfromto_counts = totfromto_counts.iloc[:1000]
totfromto_counts["ID"]=totfromto_counts["from"]
toptexters = totfromto_counts[(totfromto_counts["TotalCount"]>5000) & (totfromto_counts["TotalCount"]<7000)]["ID"].tolist()
multi = alt.selection_point(on='mouseover', nearest=True)
alt.Chart(totfromto_counts.iloc[2:,]).mark_point().encode(
alt.X('FromCount:Q',scale = alt.Scale(domain=[500, 4000])),
alt.Y('ToCount:Q',scale = alt.Scale(domain = [500,3500])),
tooltip=['ID:N'],
color=alt.condition(multi, alt.value('black'), alt.value('lightgray'))
).add_params(
multi
).properties(
title='Relationship between the number of messages sent and received'
).interactive()
The nature of individual IDs can be observed through this plot. The number of messages sent and received is approximately equal, so this plot suggests that there is a balanced exchange of information between the parties involved.
Visualization 2: Total number of messages sent and received per ID
fromhl_counts = df.groupby([df["day"],df["hour"],df["from"]]).size().reset_index().rename(columns={0: "Count"})
tohl_counts = df.groupby([df["day"],df["hour"],df["to"]]).size().reset_index().rename(columns={0: "Count"})
dfviz1 = pd.merge(fromhl_counts,tohl_counts,left_on=["day","hour","from"],right_on=["day","hour","to"])
dfviz1["id"]=dfviz1["from"]
dfviz1["total"]=dfviz1["Count_x"]+dfviz1["Count_y"]
dfviz1 = dfviz1.drop(columns=["from","to"])
#dfviz1 = dfviz1[dfviz1["id"].isin(toptexters)]
dfviz1["Timestamp"] = pd.to_datetime(dict(year=2014, month=6, day=dfviz1["day"], hour=dfviz1["hour"]))
dfviz1 = dfviz1.rename(columns={"Count_x":"Count_from","Count_y":"Count_to"})
dfviz1 = dfviz1.drop(columns=["day","hour"])
dfviz1 = pd.merge(dfviz1,id_counts,left_on=["id"],right_on=["id"])
dfviz1
| Count_from | Count_to | id | total | Timestamp | TotCount | |
|---|---|---|---|---|---|---|
| 0 | 1 | 4 | 1000708 | 5 | 2014-06-06 08:00:00 | 101 |
| 1 | 3 | 3 | 1000708 | 6 | 2014-06-06 09:00:00 | 101 |
| 2 | 1 | 1 | 1000708 | 2 | 2014-06-06 10:00:00 | 101 |
| 3 | 1 | 1 | 1000708 | 2 | 2014-06-06 11:00:00 | 101 |
| 4 | 1 | 4 | 1000708 | 5 | 2014-06-06 12:00:00 | 101 |
| ... | ... | ... | ... | ... | ... | ... |
| 136053 | 1 | 2 | 158818 | 3 | 2014-06-08 20:00:00 | 6 |
| 136054 | 2 | 1 | 1594937 | 3 | 2014-06-08 20:00:00 | 7 |
| 136055 | 3 | 1 | 162882 | 4 | 2014-06-08 20:00:00 | 7 |
| 136056 | 1 | 1 | 1895812 | 2 | 2014-06-08 20:00:00 | 9 |
| 136057 | 1 | 1 | 941716 | 2 | 2014-06-08 23:00:00 | 15 |
136058 rows × 6 columns
alt.data_transformers.disable_max_rows()
interval = alt.selection_interval(encodings=['x'])
selector = alt.Chart(dfviz1).mark_bar().encode(
x = "Timestamp:T",
y = "sum(total)"
).properties(
width=800,
height=50
).add_params(
interval
).properties(
title='Total number of messages sent and received per ID'
)
chart = alt.Chart(dfviz1).mark_bar().transform_filter(
interval
).transform_aggregate(
vizsum ='sum(total)',
totsum = 'median(TotCount)',
groupby=['id']
).encode(
alt.X('id:N').sort('-y'),
y = 'vizsum:Q',
color = 'totsum:Q'
).transform_filter(
'(datum.id != "1278894") & (datum.id != "839736")'
).transform_window(
window=[{'op': 'rank', 'as': 'rank'}],
sort=[{'field': 'vizsum', 'order': 'descending'}]
).transform_filter('datum.rank <= 30').properties(
width=800,
height=300,
).properties(
title='Total number of messages sent and received throughout Friday-Sunday'
)
chart & selector